Load in the data that was cleaned and combined from different data sets. This work can be seen in the Appendix.
data <- read.csv("data/combined_zillow_project_data.csv")
library(tigris)
library(sf)
library(dplyr)
library(ggplot2)
Base map construction for contiguous US
# Load counties but exclude Alaska, Hawaii, and U.S. territories
us_counties_sf <- counties(cb = TRUE, class = "sf") %>%
filter(!substr(GEOID, 1, 2) %in% c("02", "15", "60", "66", "69", "72", "78"))
## Retrieving data for the year 2022
## | | | 0% | | | 1% | |= | 1% | |= | 2% | |== | 2% | |== | 3% | |== | 4% | |=== | 4% | |=== | 5% | |==== | 5% | |==== | 6% | |===== | 6% | |===== | 7% | |===== | 8% | |====== | 8% | |====== | 9% | |======= | 9% | |======= | 10% | |======= | 11% | |======== | 11% | |======== | 12% | |========= | 12% | |========= | 13% | |========== | 14% | |========== | 15% | |=========== | 15% | |=========== | 16% | |============ | 17% | |============ | 18% | |============= | 18% | |============= | 19% | |============== | 19% | |============== | 20% | |============== | 21% | |=============== | 21% | |=============== | 22% | |================ | 22% | |================ | 23% | |================ | 24% | |================= | 24% | |================= | 25% | |================== | 25% | |================== | 26% | |=================== | 26% | |=================== | 27% | |=================== | 28% | |==================== | 28% | |==================== | 29% | |===================== | 29% | |===================== | 30% | |===================== | 31% | |====================== | 31% | |====================== | 32% | |======================= | 32% | |======================= | 33% | |======================= | 34% | |======================== | 34% | |======================== | 35% | |========================= | 35% | |========================= | 36% | |========================== | 36% | |========================== | 37% | |========================== | 38% | |=========================== | 38% | |=========================== | 39% | |============================ | 39% | |============================ | 40% | |============================ | 41% | |============================= | 41% | |============================= | 42% | |============================== | 42% | |============================== | 43% | |=============================== | 44% | |=============================== | 45% | |================================ | 45% | |================================ | 46% | |================================= | 47% | |================================= | 48% | |================================== | 48% | |================================== | 49% | |=================================== | 49% | |=================================== | 50% | |=================================== | 51% | |==================================== | 51% | |==================================== | 52% | |===================================== | 52% | |===================================== | 53% | |===================================== | 54% | |====================================== | 54% | |====================================== | 55% | |======================================= | 55% | |======================================= | 56% | |======================================== | 56% | |======================================== | 57% | |======================================== | 58% | |========================================= | 58% | |========================================= | 59% | |========================================== | 59% | |========================================== | 60% | |========================================== | 61% | |=========================================== | 61% | |=========================================== | 62% | |============================================ | 62% | |============================================ | 63% | |============================================ | 64% | |============================================= | 64% | |============================================= | 65% | |============================================== | 65% | |============================================== | 66% | |=============================================== | 66% | |=============================================== | 67% | |=============================================== | 68% | |================================================ | 68% | |================================================ | 69% | |================================================= | 69% | |================================================= | 70% | |================================================= | 71% | |================================================== | 71% | |================================================== | 72% | |=================================================== | 72% | |=================================================== | 73% | |==================================================== | 74% | |==================================================== | 75% | |===================================================== | 75% | |===================================================== | 76% | |====================================================== | 77% | |====================================================== | 78% | |======================================================= | 78% | |======================================================= | 79% | |======================================================== | 79% | |======================================================== | 80% | |======================================================== | 81% | |========================================================= | 81% | |========================================================= | 82% | |========================================================== | 82% | |========================================================== | 83% | |========================================================== | 84% | |=========================================================== | 84% | |=========================================================== | 85% | |============================================================ | 85% | |============================================================ | 86% | |============================================================= | 87% | |============================================================= | 88% | |============================================================== | 88% | |============================================================== | 89% | |=============================================================== | 89% | |=============================================================== | 90% | |=============================================================== | 91% | |================================================================ | 91% | |================================================================ | 92% | |================================================================= | 92% | |================================================================= | 93% | |================================================================= | 94% | |================================================================== | 94% | |================================================================== | 95% | |=================================================================== | 95% | |=================================================================== | 96% | |==================================================================== | 97% | |==================================================================== | 98% | |===================================================================== | 98% | |===================================================================== | 99% | |======================================================================| 99% | |======================================================================| 100%
# Plot only the mainland U.S.
ggplot(us_counties_sf) +
geom_sf(fill = "lightgray", color = "black") +
coord_sf(xlim = c(-125, -66), ylim = c(24, 50), expand = FALSE) + # Crop to mainland
ggtitle("Mainland U.S. Counties (Excluding AK, HI, & Territories)") +
theme_minimal()
# Aggregate by county and calculate the average of several columns together
county_aggregated_df <- data %>%
group_by(CountyName, GEOID, StateName, lat, lng) %>%
summarise(
avg_historical_range_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31, X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31, X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31, X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31, X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31, X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31, X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31, X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31, X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31, X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31, X2025.01.31), na.rm = TRUE),
avg_2024_rent = mean(c(X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31), na.rm = TRUE),
avg_2023_rent = mean(c(X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31), na.rm = TRUE),
avg_2022_rent = mean(c(X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31), na.rm = TRUE),
avg_2021_rent = mean(c(X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31), na.rm = TRUE),
avg_2020_rent = mean(c(X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31), na.rm = TRUE),
avg_2019_rent = mean(c(X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31), na.rm = TRUE),
avg_2018_rent = mean(c(X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31), na.rm = TRUE),
avg_2017_rent = mean(c(X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31), na.rm = TRUE),
avg_2016_rent = mean(c(X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31), na.rm = TRUE),
avg_2015_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31), na.rm = TRUE),
avg_population = mean(population, na.rm = TRUE),
avg_historical_range_median_income = mean(c(median_income_2015, median_income_2016, median_income_2017, median_income_2018, median_income_2019, median_income_2020, median_income_2021, median_income_2022), na.rm = TRUE),
avg_2015_median_income = mean(median_income_2015, na.rm = TRUE),
avg_2016_median_income = mean(median_income_2016, na.rm = TRUE),
avg_2017_median_income = mean(median_income_2017, na.rm = TRUE),
avg_2018_median_income = mean(median_income_2018, na.rm = TRUE),
avg_2019_median_income = mean(median_income_2019, na.rm = TRUE),
avg_2020_median_income = mean(median_income_2020, na.rm = TRUE),
avg_2021_median_income = mean(median_income_2021, na.rm = TRUE),
avg_2022_median_income = mean(median_income_2022, na.rm = TRUE),
avg_historical_range_median_age = mean(c(median_age_2015, median_age_2016, median_age_2017, median_age_2018, median_age_2019, median_age_2020, median_age_2021, median_age_2022), na.rm = TRUE),
avg_2015_median_age = mean(median_age_2015, na.rm = TRUE),
avg_2016_median_age = mean(median_age_2016, na.rm = TRUE),
avg_2017_median_age = mean(median_age_2017, na.rm = TRUE),
avg_2018_median_age = mean(median_age_2018, na.rm = TRUE),
avg_2019_median_age = mean(median_age_2019, na.rm = TRUE),
avg_2020_median_age = mean(median_age_2020, na.rm = TRUE),
avg_2021_median_age = mean(median_age_2021, na.rm = TRUE),
avg_2022_median_age = mean(median_age_2022, na.rm = TRUE),
avg_historical_range_property_taxes = mean(c(property_taxes.x_2015, property_taxes.x_2016, property_taxes.x_2017, property_taxes.x_2018, property_taxes.x_2019, property_taxes.x_2020, property_taxes.x_2021, property_taxes.x_2022), na.rm = TRUE),
avg_2015_property_taxes = mean(property_taxes.x_2015, na.rm = TRUE),
avg_2016_property_taxes = mean(property_taxes.x_2016, na.rm = TRUE),
avg_2017_property_taxes = mean(property_taxes.x_2017, na.rm = TRUE),
avg_2018_property_taxes = mean(property_taxes.x_2018, na.rm = TRUE),
avg_2019_property_taxes = mean(property_taxes.x_2019, na.rm = TRUE),
avg_2020_property_taxes = mean(property_taxes.x_2020, na.rm = TRUE),
avg_2021_property_taxes = mean(property_taxes.x_2021, na.rm = TRUE),
avg_2022_property_taxes = mean(property_taxes.x_2022, na.rm = TRUE),
)
## `summarise()` has grouped output by 'CountyName', 'GEOID', 'StateName', 'lat'.
## You can override using the `.groups` argument.
county_aggregated_df$GEOID <- as.character(county_aggregated_df$GEOID)
# List of potential values
valid_categories <- c("AL", "AK", "AZ", "AR", "CA", "CO", "CT")
county_aggregated_df <- county_aggregated_df %>%
mutate(
GEOID = case_when(
StateName %in% valid_categories ~ str_pad(GEOID, width = max(nchar(GEOID)) + 1, side = "left", pad = "0"),
TRUE ~ GEOID # Default case (for rows not matching the condition)
)
)
map_data <- us_counties_sf %>%
left_join(county_aggregated_df, by = "GEOID")
ggplot(map_data) +
geom_sf(aes(fill = avg_historical_range_rent), color = "black") +
scale_fill_viridis_c(name = "Average Rent ($)") +
coord_sf(xlim = c(-125, -66), ylim = c(24, 50), expand = FALSE) +
ggtitle("Average Historical Rent Price (2015-2024) by County (Mainland U.S.)") +
theme_minimal()
ggplot(map_data) +
geom_sf(aes(fill = avg_historical_range_median_income), color = "black") +
scale_fill_viridis_c(name = "Average Median Income ($)") +
coord_sf(xlim = c(-125, -66), ylim = c(24, 50), expand = FALSE) +
ggtitle("Average Historical Median Income (2015-2022) by County (Mainland U.S.)") +
theme_minimal()
ggplot(map_data) +
geom_sf(aes(fill = avg_historical_range_property_taxes), color = "black") +
scale_fill_viridis_c(name = "Average Property Taxes ($)") +
coord_sf(xlim = c(-125, -66), ylim = c(24, 50), expand = FALSE) +
ggtitle("Average Historical Property Taxes (2015-2022) by County (Mainland U.S.)") +
theme_minimal()
ggplot(map_data) +
geom_sf(aes(fill = avg_historical_range_median_age), color = "black") +
scale_fill_viridis_c(name = "Average Median Age (years)") +
coord_sf(xlim = c(-125, -66), ylim = c(24, 50), expand = FALSE) +
ggtitle("Average Historical Median Age (2015-2022) by County (Mainland U.S.)") +
theme_minimal()
Examine correlations between historical rent prices and population sizes, median income, median age, and property taxes:
cor(county_aggregated_df$avg_historical_range_rent, county_aggregated_df$avg_population)
cor(county_aggregated_df$avg_historical_range_rent, county_aggregated_df$avg_historical_range_median_income)
cor(county_aggregated_df$avg_historical_range_rent, county_aggregated_df$avg_historical_range_median_age)
cor(county_aggregated_df$avg_historical_range_rent, county_aggregated_df$avg_historical_range_property_taxes)
The strongest correlations are between rent prices and the median income (0.43) and the property taxes (0.409)
Pick States to Zoom in On
table(data$StateName)
##
## AK AL AR AZ CA CO DC DE FL GA HI IA ID IL IN KS KY LA MA MD
## 2 54 25 49 396 64 1 9 279 112 18 29 22 118 50 25 28 35 104 67
## ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC
## 14 88 62 37 26 10 108 10 12 16 166 13 12 127 116 38 57 148 21 58
## SD TN TX UT VA VT WA WI WV WY
## 7 59 251 49 67 2 102 47 9 8
States with the most data points to zoom in on: California, Florida, and Texas
CA_data <- data %>%
filter(StateName == 'CA')
FL_data <- data %>%
filter(StateName == 'FL')
TX_data <- data %>%
filter(StateName == 'TX')
CA_county_aggregated_df <- CA_data %>%
group_by(CountyName, GEOID, lat, lng) %>%
summarise(
avg_historical_range_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31, X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31, X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31, X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31, X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31, X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31, X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31, X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31, X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31, X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31, X2025.01.31), na.rm = TRUE),
avg_2024_rent = mean(c(X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31), na.rm = TRUE),
avg_2023_rent = mean(c(X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31), na.rm = TRUE),
avg_2022_rent = mean(c(X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31), na.rm = TRUE),
avg_2021_rent = mean(c(X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31), na.rm = TRUE),
avg_2020_rent = mean(c(X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31), na.rm = TRUE),
avg_2019_rent = mean(c(X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31), na.rm = TRUE),
avg_2018_rent = mean(c(X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31), na.rm = TRUE),
avg_2017_rent = mean(c(X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31), na.rm = TRUE),
avg_2016_rent = mean(c(X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31), na.rm = TRUE),
avg_2015_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31), na.rm = TRUE),
avg_population = mean(population, na.rm = TRUE),
avg_historical_range_median_income = mean(c(median_income_2015, median_income_2016, median_income_2017, median_income_2018, median_income_2019, median_income_2020, median_income_2021, median_income_2022), na.rm = TRUE),
avg_2015_median_income = mean(median_income_2015, na.rm = TRUE),
avg_2016_median_income = mean(median_income_2016, na.rm = TRUE),
avg_2017_median_income = mean(median_income_2017, na.rm = TRUE),
avg_2018_median_income = mean(median_income_2018, na.rm = TRUE),
avg_2019_median_income = mean(median_income_2019, na.rm = TRUE),
avg_2020_median_income = mean(median_income_2020, na.rm = TRUE),
avg_2021_median_income = mean(median_income_2021, na.rm = TRUE),
avg_2022_median_income = mean(median_income_2022, na.rm = TRUE),
avg_historical_range_median_age = mean(c(median_age_2015, median_age_2016, median_age_2017, median_age_2018, median_age_2019, median_age_2020, median_age_2021, median_age_2022), na.rm = TRUE),
avg_2015_median_age = mean(median_age_2015, na.rm = TRUE),
avg_2016_median_age = mean(median_age_2016, na.rm = TRUE),
avg_2017_median_age = mean(median_age_2017, na.rm = TRUE),
avg_2018_median_age = mean(median_age_2018, na.rm = TRUE),
avg_2019_median_age = mean(median_age_2019, na.rm = TRUE),
avg_2020_median_age = mean(median_age_2020, na.rm = TRUE),
avg_2021_median_age = mean(median_age_2021, na.rm = TRUE),
avg_2022_median_age = mean(median_age_2022, na.rm = TRUE),
avg_historical_range_property_taxes = mean(c(property_taxes.x_2015, property_taxes.x_2016, property_taxes.x_2017, property_taxes.x_2018, property_taxes.x_2019, property_taxes.x_2020, property_taxes.x_2021, property_taxes.x_2022), na.rm = TRUE),
avg_2015_property_taxes = mean(property_taxes.x_2015, na.rm = TRUE),
avg_2016_property_taxes = mean(property_taxes.x_2016, na.rm = TRUE),
avg_2017_property_taxes = mean(property_taxes.x_2017, na.rm = TRUE),
avg_2018_property_taxes = mean(property_taxes.x_2018, na.rm = TRUE),
avg_2019_property_taxes = mean(property_taxes.x_2019, na.rm = TRUE),
avg_2020_property_taxes = mean(property_taxes.x_2020, na.rm = TRUE),
avg_2021_property_taxes = mean(property_taxes.x_2021, na.rm = TRUE),
avg_2022_property_taxes = mean(property_taxes.x_2022, na.rm = TRUE)
)
## `summarise()` has grouped output by 'CountyName', 'GEOID', 'lat'. You can
## override using the `.groups` argument.
state_fips <- "06"
counties_sf <- counties(state = state_fips, cb = TRUE, class = "sf")
## Retrieving data for the year 2022
CA_county_aggregated_df$GEOID <- as.character(CA_county_aggregated_df$GEOID)
CA_county_aggregated_df <- CA_county_aggregated_df %>%
mutate(GEOID = str_pad(GEOID, width = max(nchar(GEOID)) + 1, side = "left", pad = "0"))
map_data <- counties_sf %>%
left_join(CA_county_aggregated_df, by = "GEOID")
ggplot(map_data) +
geom_sf(aes(fill = avg_historical_range_rent), color = "black") +
scale_fill_viridis_c(name = "Average Rent ($)") +
ggtitle("Average Historical Rent Price (2015-2025) by County (Mainland U.S.)") +
theme_minimal()
ggplot(map_data) +
geom_sf(aes(fill = avg_2024_rent), color = "black") +
scale_fill_viridis_c(name = "Average Rent ($)") +
ggtitle("Average Rent Price in 2024 by County (California)") +
theme_minimal()
FL_county_aggregated_df <- FL_data %>%
group_by(CountyName, GEOID, lat, lng) %>%
summarise(
avg_historical_range_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31, X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31, X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31, X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31, X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31, X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31, X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31, X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31, X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31, X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31, X2025.01.31), na.rm = TRUE),
avg_2024_rent = mean(c(X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31), na.rm = TRUE),
avg_2023_rent = mean(c(X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31), na.rm = TRUE),
avg_2022_rent = mean(c(X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31), na.rm = TRUE),
avg_2021_rent = mean(c(X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31), na.rm = TRUE),
avg_2020_rent = mean(c(X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31), na.rm = TRUE),
avg_2019_rent = mean(c(X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31), na.rm = TRUE),
avg_2018_rent = mean(c(X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31), na.rm = TRUE),
avg_2017_rent = mean(c(X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31), na.rm = TRUE),
avg_2016_rent = mean(c(X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31), na.rm = TRUE),
avg_2015_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31), na.rm = TRUE),
avg_population = mean(population, na.rm = TRUE),
avg_historical_range_median_income = mean(c(median_income_2015, median_income_2016, median_income_2017, median_income_2018, median_income_2019, median_income_2020, median_income_2021, median_income_2022), na.rm = TRUE),
avg_2015_median_income = mean(median_income_2015, na.rm = TRUE),
avg_2016_median_income = mean(median_income_2016, na.rm = TRUE),
avg_2017_median_income = mean(median_income_2017, na.rm = TRUE),
avg_2018_median_income = mean(median_income_2018, na.rm = TRUE),
avg_2019_median_income = mean(median_income_2019, na.rm = TRUE),
avg_2020_median_income = mean(median_income_2020, na.rm = TRUE),
avg_2021_median_income = mean(median_income_2021, na.rm = TRUE),
avg_2022_median_income = mean(median_income_2022, na.rm = TRUE),
avg_historical_range_median_age = mean(c(median_age_2015, median_age_2016, median_age_2017, median_age_2018, median_age_2019, median_age_2020, median_age_2021, median_age_2022), na.rm = TRUE),
avg_2015_median_age = mean(median_age_2015, na.rm = TRUE),
avg_2016_median_age = mean(median_age_2016, na.rm = TRUE),
avg_2017_median_age = mean(median_age_2017, na.rm = TRUE),
avg_2018_median_age = mean(median_age_2018, na.rm = TRUE),
avg_2019_median_age = mean(median_age_2019, na.rm = TRUE),
avg_2020_median_age = mean(median_age_2020, na.rm = TRUE),
avg_2021_median_age = mean(median_age_2021, na.rm = TRUE),
avg_2022_median_age = mean(median_age_2022, na.rm = TRUE),
avg_historical_range_property_taxes = mean(c(property_taxes.x_2015, property_taxes.x_2016, property_taxes.x_2017, property_taxes.x_2018, property_taxes.x_2019, property_taxes.x_2020, property_taxes.x_2021, property_taxes.x_2022), na.rm = TRUE),
avg_2015_property_taxes = mean(property_taxes.x_2015, na.rm = TRUE),
avg_2016_property_taxes = mean(property_taxes.x_2016, na.rm = TRUE),
avg_2017_property_taxes = mean(property_taxes.x_2017, na.rm = TRUE),
avg_2018_property_taxes = mean(property_taxes.x_2018, na.rm = TRUE),
avg_2019_property_taxes = mean(property_taxes.x_2019, na.rm = TRUE),
avg_2020_property_taxes = mean(property_taxes.x_2020, na.rm = TRUE),
avg_2021_property_taxes = mean(property_taxes.x_2021, na.rm = TRUE),
avg_2022_property_taxes = mean(property_taxes.x_2022, na.rm = TRUE)
)
## `summarise()` has grouped output by 'CountyName', 'GEOID', 'lat'. You can
## override using the `.groups` argument.
state_fips <- "12"
counties_sf <- counties(state = state_fips, cb = TRUE, class = "sf")
## Retrieving data for the year 2022
FL_county_aggregated_df$GEOID <- as.character(FL_county_aggregated_df$GEOID)
map_data <- counties_sf %>%
left_join(FL_county_aggregated_df, by = "GEOID")
ggplot(map_data) +
geom_sf(aes(fill = avg_historical_range_rent), color = "black") +
scale_fill_viridis_c(name = "Average Rent ($)") +
ggtitle("Average Historical Rent Price (2015-2024) by County (Florida)") +
theme_minimal()
ggplot(map_data) +
geom_sf(aes(fill = avg_2024_rent), color = "black") +
scale_fill_viridis_c(name = "Average Rent ($)") +
ggtitle("Average Rent Price in 2024 by County (Florida)") +
theme_minimal()
TX_county_aggregated_df <- TX_data %>%
group_by(CountyName, GEOID, lat, lng) %>%
summarise(
avg_historical_range_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31, X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31, X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31, X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31, X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31, X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31, X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31, X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31, X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31, X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31, X2025.01.31), na.rm = TRUE),
avg_2024_rent = mean(c(X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31), na.rm = TRUE),
avg_2023_rent = mean(c(X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31), na.rm = TRUE),
avg_2022_rent = mean(c(X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31), na.rm = TRUE),
avg_2021_rent = mean(c(X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31), na.rm = TRUE),
avg_2020_rent = mean(c(X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31), na.rm = TRUE),
avg_2019_rent = mean(c(X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31), na.rm = TRUE),
avg_2018_rent = mean(c(X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31), na.rm = TRUE),
avg_2017_rent = mean(c(X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31), na.rm = TRUE),
avg_2016_rent = mean(c(X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31), na.rm = TRUE),
avg_2015_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31), na.rm = TRUE),
avg_population = mean(population, na.rm = TRUE),
avg_historical_range_median_income = mean(c(median_income_2015, median_income_2016, median_income_2017, median_income_2018, median_income_2019, median_income_2020, median_income_2021, median_income_2022), na.rm = TRUE),
avg_2015_median_income = mean(median_income_2015, na.rm = TRUE),
avg_2016_median_income = mean(median_income_2016, na.rm = TRUE),
avg_2017_median_income = mean(median_income_2017, na.rm = TRUE),
avg_2018_median_income = mean(median_income_2018, na.rm = TRUE),
avg_2019_median_income = mean(median_income_2019, na.rm = TRUE),
avg_2020_median_income = mean(median_income_2020, na.rm = TRUE),
avg_2021_median_income = mean(median_income_2021, na.rm = TRUE),
avg_2022_median_income = mean(median_income_2022, na.rm = TRUE),
avg_historical_range_median_age = mean(c(median_age_2015, median_age_2016, median_age_2017, median_age_2018, median_age_2019, median_age_2020, median_age_2021, median_age_2022), na.rm = TRUE),
avg_2015_median_age = mean(median_age_2015, na.rm = TRUE),
avg_2016_median_age = mean(median_age_2016, na.rm = TRUE),
avg_2017_median_age = mean(median_age_2017, na.rm = TRUE),
avg_2018_median_age = mean(median_age_2018, na.rm = TRUE),
avg_2019_median_age = mean(median_age_2019, na.rm = TRUE),
avg_2020_median_age = mean(median_age_2020, na.rm = TRUE),
avg_2021_median_age = mean(median_age_2021, na.rm = TRUE),
avg_2022_median_age = mean(median_age_2022, na.rm = TRUE),
avg_historical_range_property_taxes = mean(c(property_taxes.x_2015, property_taxes.x_2016, property_taxes.x_2017, property_taxes.x_2018, property_taxes.x_2019, property_taxes.x_2020, property_taxes.x_2021, property_taxes.x_2022), na.rm = TRUE),
avg_2015_property_taxes = mean(property_taxes.x_2015, na.rm = TRUE),
avg_2016_property_taxes = mean(property_taxes.x_2016, na.rm = TRUE),
avg_2017_property_taxes = mean(property_taxes.x_2017, na.rm = TRUE),
avg_2018_property_taxes = mean(property_taxes.x_2018, na.rm = TRUE),
avg_2019_property_taxes = mean(property_taxes.x_2019, na.rm = TRUE),
avg_2020_property_taxes = mean(property_taxes.x_2020, na.rm = TRUE),
avg_2021_property_taxes = mean(property_taxes.x_2021, na.rm = TRUE),
avg_2022_property_taxes = mean(property_taxes.x_2022, na.rm = TRUE)
)
## `summarise()` has grouped output by 'CountyName', 'GEOID', 'lat'. You can
## override using the `.groups` argument.
state_fips <- "48"
counties_sf <- counties(state = state_fips, cb = TRUE, class = "sf")
## Retrieving data for the year 2022
TX_county_aggregated_df$GEOID <- as.character(TX_county_aggregated_df$GEOID)
map_data <- counties_sf %>%
left_join(TX_county_aggregated_df, by = "GEOID")
ggplot(map_data) +
geom_sf(aes(fill = avg_historical_range_rent), color = "black") +
scale_fill_viridis_c(name = "Average Rent ($)") +
ggtitle("Average Historical Rent Price (2015-2024) by County (Texas)") +
theme_minimal()
ggplot(map_data) +
geom_sf(aes(fill = avg_2024_rent), color = "black") +
scale_fill_viridis_c(name = "Average Rent ($)") +
ggtitle("Average Rent Price in 2024 by County (Texas)") +
theme_minimal()
First, make a linear regression model just using the past years rent to see how the model performs in predicting the 2024 rent price for each county.
data$avg_rent_2024 <- rowMeans(data[, c('X2024.01.31', 'X2024.02.29', 'X2024.03.31', 'X2024.04.30', 'X2024.05.31', 'X2024.06.30', 'X2024.07.31', 'X2024.08.31', 'X2024.09.30', 'X2024.10.31', 'X2024.11.30', 'X2024.12.31')], na.rm = TRUE)
data$avg_rent_2023 <- rowMeans(data[, c('X2023.01.31', 'X2023.02.28', 'X2023.03.31', 'X2023.04.30', 'X2023.05.31', 'X2023.06.30', 'X2023.07.31', 'X2023.08.31', 'X2023.09.30', 'X2023.10.31', 'X2023.11.30', 'X2023.12.31')], na.rm = TRUE)
data$avg_rent_2022 <- rowMeans(data[, c('X2022.01.31', 'X2022.02.28', 'X2022.03.31', 'X2022.04.30', 'X2022.05.31', 'X2022.06.30', 'X2022.07.31', 'X2022.08.31', 'X2022.09.30', 'X2022.10.31', 'X2022.11.30', 'X2022.12.31')], na.rm = TRUE)
data$avg_rent_2021 <- rowMeans(data[, c('X2021.01.31', 'X2021.02.28', 'X2021.03.31', 'X2021.04.30', 'X2021.05.31', 'X2021.06.30', 'X2021.07.31', 'X2021.08.31', 'X2021.09.30', 'X2021.10.31', 'X2021.11.30', 'X2021.12.31')], na.rm = TRUE)
data$avg_rent_2020 <- rowMeans(data[, c('X2020.01.31', 'X2020.02.29', 'X2020.03.31', 'X2020.04.30', 'X2020.05.31', 'X2020.06.30', 'X2020.07.31', 'X2020.08.31', 'X2020.09.30', 'X2020.10.31', 'X2020.11.30', 'X2020.12.31')], na.rm = TRUE)
data$avg_rent_2019 <- rowMeans(data[, c('X2019.01.31', 'X2019.02.28', 'X2019.03.31', 'X2019.04.30', 'X2019.05.31', 'X2019.06.30', 'X2019.07.31', 'X2019.08.31', 'X2019.09.30', 'X2019.10.31', 'X2019.11.30', 'X2019.12.31')], na.rm = TRUE)
data$avg_rent_2018 <- rowMeans(data[, c('X2018.01.31', 'X2018.02.28', 'X2018.03.31', 'X2018.04.30', 'X2018.05.31', 'X2018.06.30', 'X2018.07.31', 'X2018.08.31', 'X2018.09.30', 'X2018.10.31', 'X2018.11.30', 'X2018.12.31')], na.rm = TRUE)
data$avg_rent_2017 <- rowMeans(data[, c('X2017.01.31', 'X2017.02.28', 'X2017.03.31', 'X2017.04.30', 'X2017.05.31', 'X2017.06.30', 'X2017.07.31', 'X2017.08.31', 'X2017.09.30', 'X2017.10.31', 'X2017.11.30', 'X2017.12.31')], na.rm = TRUE)
data$avg_rent_2016 <- rowMeans(data[, c('X2016.01.31', 'X2016.02.29', 'X2016.03.31', 'X2016.04.30', 'X2016.05.31', 'X2016.06.30', 'X2016.07.31', 'X2016.08.31', 'X2016.09.30', 'X2016.10.31', 'X2016.11.30', 'X2016.12.31')], na.rm = TRUE)
data$avg_rent_2015 <- rowMeans(data[, c('X2015.01.31', 'X2015.02.28', 'X2015.03.31', 'X2015.04.30', 'X2015.05.31', 'X2015.06.30', 'X2015.07.31', 'X2015.08.31', 'X2015.09.30', 'X2015.10.31', 'X2015.11.30', 'X2015.12.31')], na.rm = TRUE)
Make a new data frame selecting only the columns we want for modeling
data_modeling <- data[, c('avg_rent_2024', 'avg_rent_2023', 'avg_rent_2022', 'avg_rent_2021', 'avg_rent_2020', 'avg_rent_2019', 'avg_rent_2018', 'avg_rent_2017', 'avg_rent_2016', 'avg_rent_2015', 'population', 'property_taxes.x_2022', 'property_taxes.x_2021', 'property_taxes.x_2020', 'property_taxes.x_2019', 'property_taxes.x_2018', 'property_taxes.x_2017', 'property_taxes.x_2016', 'property_taxes.x_2015', 'median_income_2022', 'median_income_2021', 'median_income_2020', 'median_income_2019', 'median_income_2018', 'median_income_2017', 'median_income_2016', 'median_income_2015', 'county_state', 'StateName')]
Save the factors as needed.
data_modeling$county_state <- as.factor(data_modeling$county_state)
data_modeling$StateName <- as.factor(data_modeling$StateName)
Now we model! (Start with a linear model with all factors included)
lm_model_full <- lm(avg_rent_2024 ~ . -(county_state), data = data_modeling)
summary(lm_model_full)
##
## Call:
## lm(formula = avg_rent_2024 ~ . - (county_state), data = data_modeling)
##
## Residuals:
## Min 1Q Median 3Q Max
## -92.95 -14.73 0.33 14.57 98.84
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.80e+01 1.37e+01 -2.04 0.04221 *
## avg_rent_2023 1.26e+00 3.53e-02 35.67 < 2e-16 ***
## avg_rent_2022 -2.90e-01 4.86e-02 -5.97 4.2e-09 ***
## avg_rent_2021 -4.11e-02 4.19e-02 -0.98 0.32669
## avg_rent_2020 4.71e-01 7.67e-02 6.14 1.5e-09 ***
## avg_rent_2019 -2.28e-01 8.24e-02 -2.77 0.00579 **
## avg_rent_2018 -2.00e-01 9.95e-02 -2.01 0.04484 *
## avg_rent_2017 2.71e-01 1.32e-01 2.06 0.04019 *
## avg_rent_2016 -4.71e-01 1.22e-01 -3.87 0.00012 ***
## avg_rent_2015 2.75e-01 6.14e-02 4.48 9.2e-06 ***
## population -3.46e-06 8.23e-07 -4.20 3.0e-05 ***
## property_taxes.x_2022 3.96e-02 2.34e-02 1.70 0.09024 .
## property_taxes.x_2021 1.65e-02 4.82e-02 0.34 0.73283
## property_taxes.x_2020 -9.40e-02 4.80e-02 -1.96 0.05061 .
## property_taxes.x_2019 -9.09e-02 5.71e-02 -1.59 0.11167
## property_taxes.x_2018 1.23e-01 7.44e-02 1.65 0.09897 .
## property_taxes.x_2017 1.41e-02 8.88e-02 0.16 0.87407
## property_taxes.x_2016 -1.71e-04 9.28e-02 0.00 0.99853
## property_taxes.x_2015 -4.67e-03 4.99e-02 -0.09 0.92539
## median_income_2022 -2.40e-03 1.63e-03 -1.47 0.14221
## median_income_2021 3.57e-03 2.64e-03 1.35 0.17695
## median_income_2020 -3.75e-05 2.29e-03 -0.02 0.98694
## median_income_2019 -4.61e-03 2.38e-03 -1.94 0.05334 .
## median_income_2018 1.05e-03 2.79e-03 0.38 0.70525
## median_income_2017 2.20e-03 3.09e-03 0.71 0.47699
## median_income_2016 -4.24e-03 3.24e-03 -1.31 0.19057
## median_income_2015 4.83e-03 2.17e-03 2.23 0.02634 *
## StateNameAR 1.57e+01 1.76e+01 0.89 0.37265
## StateNameAZ -1.43e+01 1.36e+01 -1.05 0.29223
## StateNameCA 2.26e+01 1.35e+01 1.67 0.09454 .
## StateNameCO 7.19e+00 1.33e+01 0.54 0.58813
## StateNameDC 2.01e+01 3.07e+01 0.66 0.51205
## StateNameDE 4.31e+01 2.23e+01 1.94 0.05322 .
## StateNameFL -3.38e+00 1.24e+01 -0.27 0.78519
## StateNameGA 1.01e+00 1.24e+01 0.08 0.93460
## StateNameHI 5.02e+01 1.75e+01 2.87 0.00428 **
## StateNameIA -9.68e+00 2.28e+01 -0.42 0.67102
## StateNameID 1.39e+01 1.82e+01 0.76 0.44651
## StateNameIL 3.55e+01 1.62e+01 2.19 0.02890 *
## StateNameIN 2.37e+01 1.44e+01 1.65 0.09926 .
## StateNameKS 2.83e+01 1.61e+01 1.76 0.07926 .
## StateNameKY 3.57e+01 2.22e+01 1.61 0.10881
## StateNameLA 2.87e+00 1.81e+01 0.16 0.87395
## StateNameMA 2.00e+01 1.68e+01 1.19 0.23325
## StateNameMD 1.68e+00 1.51e+01 0.11 0.91118
## StateNameME 1.40e+01 3.00e+01 0.46 0.64213
## StateNameMI 2.71e+01 1.41e+01 1.92 0.05524 .
## StateNameMN -1.04e+01 1.72e+01 -0.60 0.54777
## StateNameMO 2.32e+01 1.60e+01 1.45 0.14883
## StateNameMT 9.11e+00 2.29e+01 0.40 0.69118
## StateNameNC -4.03e+00 1.27e+01 -0.32 0.75052
## StateNameNE 6.77e+00 2.30e+01 0.29 0.76902
## StateNameNH 3.95e-01 3.13e+01 0.01 0.98994
## StateNameNJ 7.99e+00 2.27e+01 0.35 0.72529
## StateNameNM 8.84e+00 2.27e+01 0.39 0.69713
## StateNameNV 2.02e+01 1.76e+01 1.15 0.25171
## StateNameNY -6.73e+00 1.93e+01 -0.35 0.72779
## StateNameOH 3.22e+01 1.49e+01 2.16 0.03132 *
## StateNameOK 1.51e+01 1.45e+01 1.04 0.29935
## StateNameOR 1.48e+01 1.64e+01 0.90 0.36627
## StateNamePA 9.12e+00 1.56e+01 0.59 0.55849
## StateNameRI 5.91e+01 2.43e+01 2.43 0.01551 *
## StateNameSC 1.88e+01 1.35e+01 1.39 0.16371
## StateNameTN -2.58e+00 1.45e+01 -0.18 0.85869
## StateNameTX -1.76e+01 1.35e+01 -1.30 0.19430
## StateNameUT 9.84e+00 1.95e+01 0.50 0.61401
## StateNameVA 3.41e+01 1.39e+01 2.46 0.01418 *
## StateNameWA 2.60e+01 1.44e+01 1.80 0.07199 .
## StateNameWI 4.96e+00 2.41e+01 0.21 0.83670
## StateNameWY 1.71e+01 2.24e+01 0.76 0.44633
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.8 on 586 degrees of freedom
## (2571 observations deleted due to missingness)
## Multiple R-squared: 0.999, Adjusted R-squared: 0.999
## F-statistic: 9.34e+03 on 69 and 586 DF, p-value: <2e-16
Now use backwards selection.
# Use backwards selection
bwd_model <- step(lm_model_full, direction = "backward", trace = 0)
anova(bwd_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 4.62e+08 4.62e+08 6.50e+05 < 2e-16 ***
## avg_rent_2022 1 2.24e+05 2.24e+05 3.15e+02 < 2e-16 ***
## avg_rent_2020 1 7.37e+04 7.37e+04 1.04e+02 < 2e-16 ***
## avg_rent_2019 1 1.32e+05 1.32e+05 1.86e+02 < 2e-16 ***
## avg_rent_2018 1 1.86e+03 1.86e+03 2.62e+00 0.10581
## avg_rent_2017 1 9.02e+03 9.02e+03 1.27e+01 0.00040 ***
## avg_rent_2016 1 2.27e+02 2.27e+02 3.20e-01 0.57191
## avg_rent_2015 1 4.51e+04 4.51e+04 6.34e+01 8.4e-15 ***
## population 1 2.23e+04 2.23e+04 3.14e+01 3.2e-08 ***
## property_taxes.x_2022 1 2.10e+01 2.10e+01 3.00e-02 0.86317
## property_taxes.x_2020 1 1.07e+04 1.07e+04 1.50e+01 0.00012 ***
## property_taxes.x_2019 1 3.75e+04 3.75e+04 5.28e+01 1.1e-12 ***
## property_taxes.x_2018 1 2.81e+04 2.81e+04 3.96e+01 6.1e-10 ***
## median_income_2022 1 1.40e+03 1.40e+03 1.96e+00 0.16166
## median_income_2021 1 8.47e+03 8.47e+03 1.19e+01 0.00060 ***
## median_income_2019 1 1.77e+03 1.77e+03 2.49e+00 0.11511
## median_income_2015 1 1.46e+04 1.46e+04 2.05e+01 7.2e-06 ***
## StateName 43 1.24e+05 2.89e+03 4.07e+00 3.8e-15 ***
## Residuals 595 4.23e+05 7.11e+02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Now try removing factors one by one starting with the largest p-value each time.
new_model <- update(bwd_model, .~. -property_taxes.x_2022)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 4.62e+08 4.62e+08 6.35e+05 < 2e-16 ***
## avg_rent_2022 1 2.24e+05 2.24e+05 3.08e+02 < 2e-16 ***
## avg_rent_2020 1 7.37e+04 7.37e+04 1.01e+02 < 2e-16 ***
## avg_rent_2019 1 1.32e+05 1.32e+05 1.82e+02 < 2e-16 ***
## avg_rent_2018 1 1.86e+03 1.86e+03 2.57e+00 0.10976
## avg_rent_2017 1 9.02e+03 9.02e+03 1.24e+01 0.00046 ***
## avg_rent_2016 1 2.27e+02 2.27e+02 3.10e-01 0.57621
## avg_rent_2015 1 4.51e+04 4.51e+04 6.20e+01 1.6e-14 ***
## population 1 2.23e+04 2.23e+04 3.07e+01 4.5e-08 ***
## property_taxes.x_2020 1 2.67e+02 2.67e+02 3.70e-01 0.54478
## property_taxes.x_2019 1 4.43e+04 4.43e+04 6.10e+01 2.6e-14 ***
## property_taxes.x_2018 1 1.39e+04 1.39e+04 1.91e+01 1.4e-05 ***
## median_income_2022 1 2.20e+03 2.20e+03 3.02e+00 0.08257 .
## median_income_2021 1 7.53e+03 7.53e+03 1.04e+01 0.00135 **
## median_income_2019 1 3.13e+03 3.13e+03 4.30e+00 0.03850 *
## median_income_2015 1 1.05e+04 1.05e+04 1.45e+01 0.00015 ***
## StateName 43 1.35e+05 3.13e+03 4.31e+00 < 2e-16 ***
## Residuals 596 4.33e+05 7.27e+02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(bwd_model, .~. -avg_rent_2016)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 4.62e+08 4.62e+08 6.36e+05 < 2e-16 ***
## avg_rent_2022 1 2.24e+05 2.24e+05 3.09e+02 < 2e-16 ***
## avg_rent_2020 1 7.37e+04 7.37e+04 1.02e+02 < 2e-16 ***
## avg_rent_2019 1 1.32e+05 1.32e+05 1.82e+02 < 2e-16 ***
## avg_rent_2018 1 1.86e+03 1.86e+03 2.57e+00 0.10956
## avg_rent_2017 1 9.02e+03 9.02e+03 1.24e+01 0.00046 ***
## avg_rent_2015 1 1.08e+04 1.08e+04 1.49e+01 0.00013 ***
## population 1 1.89e+04 1.89e+04 2.60e+01 4.6e-07 ***
## property_taxes.x_2022 1 5.00e+01 5.00e+01 7.00e-02 0.79288
## property_taxes.x_2020 1 1.06e+04 1.06e+04 1.45e+01 0.00015 ***
## property_taxes.x_2019 1 3.86e+04 3.86e+04 5.32e+01 9.6e-13 ***
## property_taxes.x_2018 1 3.32e+04 3.32e+04 4.57e+01 3.3e-11 ***
## median_income_2022 1 3.50e+03 3.50e+03 4.82e+00 0.02857 *
## median_income_2021 1 8.46e+03 8.46e+03 1.17e+01 0.00068 ***
## median_income_2019 1 1.83e+03 1.83e+03 2.53e+00 0.11256
## median_income_2015 1 2.05e+04 2.05e+04 2.82e+01 1.6e-07 ***
## StateName 43 1.38e+05 3.22e+03 4.43e+00 < 2e-16 ***
## Residuals 596 4.33e+05 7.26e+02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -property_taxes.x_2022)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 4.62e+08 4.62e+08 6.20e+05 < 2e-16 ***
## avg_rent_2022 1 2.24e+05 2.24e+05 3.01e+02 < 2e-16 ***
## avg_rent_2020 1 7.37e+04 7.37e+04 9.90e+01 < 2e-16 ***
## avg_rent_2019 1 1.32e+05 1.32e+05 1.77e+02 < 2e-16 ***
## avg_rent_2018 1 1.86e+03 1.86e+03 2.51e+00 0.11398
## avg_rent_2017 1 9.02e+03 9.02e+03 1.21e+01 0.00054 ***
## avg_rent_2015 1 1.08e+04 1.08e+04 1.45e+01 0.00015 ***
## population 1 1.89e+04 1.89e+04 2.54e+01 6.3e-07 ***
## property_taxes.x_2020 1 3.51e+02 3.51e+02 4.70e-01 0.49261
## property_taxes.x_2019 1 4.49e+04 4.49e+04 6.04e+01 3.5e-14 ***
## property_taxes.x_2018 1 1.68e+04 1.68e+04 2.26e+01 2.6e-06 ***
## median_income_2022 1 4.90e+03 4.90e+03 6.59e+00 0.01052 *
## median_income_2021 1 7.48e+03 7.48e+03 1.01e+01 0.00160 **
## median_income_2019 1 3.30e+03 3.30e+03 4.43e+00 0.03570 *
## median_income_2015 1 1.56e+04 1.56e+04 2.09e+01 5.7e-06 ***
## StateName 43 1.50e+05 3.49e+03 4.69e+00 < 2e-16 ***
## Residuals 597 4.44e+05 7.44e+02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -property_taxes.x_2020)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 4.62e+08 4.62e+08 6.21e+05 < 2e-16 ***
## avg_rent_2022 1 2.24e+05 2.24e+05 3.01e+02 < 2e-16 ***
## avg_rent_2020 1 7.37e+04 7.37e+04 9.92e+01 < 2e-16 ***
## avg_rent_2019 1 1.32e+05 1.32e+05 1.78e+02 < 2e-16 ***
## avg_rent_2018 1 1.86e+03 1.86e+03 2.51e+00 0.11374
## avg_rent_2017 1 9.02e+03 9.02e+03 1.21e+01 0.00053 ***
## avg_rent_2015 1 1.08e+04 1.08e+04 1.45e+01 0.00015 ***
## population 1 1.89e+04 1.89e+04 2.54e+01 6.2e-07 ***
## property_taxes.x_2019 1 9.26e+02 9.26e+02 1.25e+00 0.26471
## property_taxes.x_2018 1 5.97e+04 5.97e+04 8.04e+01 < 2e-16 ***
## median_income_2022 1 4.75e+03 4.75e+03 6.40e+00 0.01169 *
## median_income_2021 1 8.12e+03 8.12e+03 1.09e+01 0.00100 **
## median_income_2019 1 2.97e+03 2.97e+03 3.99e+00 0.04613 *
## median_income_2015 1 1.61e+04 1.61e+04 2.17e+01 4.0e-06 ***
## StateName 43 1.51e+05 3.51e+03 4.72e+00 < 2e-16 ***
## Residuals 598 4.44e+05 7.43e+02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -property_taxes.x_2019)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 4.62e+08 4.62e+08 6.06e+05 < 2e-16 ***
## avg_rent_2022 1 2.24e+05 2.24e+05 2.94e+02 < 2e-16 ***
## avg_rent_2020 1 7.37e+04 7.37e+04 9.68e+01 < 2e-16 ***
## avg_rent_2019 1 1.32e+05 1.32e+05 1.73e+02 < 2e-16 ***
## avg_rent_2018 1 1.86e+03 1.86e+03 2.45e+00 0.11815
## avg_rent_2017 1 9.02e+03 9.02e+03 1.18e+01 0.00062 ***
## avg_rent_2015 1 1.08e+04 1.08e+04 1.42e+01 0.00018 ***
## population 1 1.89e+04 1.89e+04 2.48e+01 8.4e-07 ***
## property_taxes.x_2018 1 1.96e+03 1.96e+03 2.58e+00 0.10882
## median_income_2022 1 2.12e+02 2.12e+02 2.80e-01 0.59782
## median_income_2021 1 8.36e+03 8.36e+03 1.10e+01 0.00098 ***
## median_income_2019 1 2.32e+03 2.32e+03 3.04e+00 0.08152 .
## median_income_2015 1 3.48e+04 3.48e+04 4.57e+01 3.3e-11 ***
## StateName 43 1.84e+05 4.28e+03 5.62e+00 < 2e-16 ***
## Residuals 599 4.56e+05 7.61e+02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -median_income_2022)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 4.62e+08 4.62e+08 6.05e+05 < 2e-16 ***
## avg_rent_2022 1 2.24e+05 2.24e+05 2.94e+02 < 2e-16 ***
## avg_rent_2020 1 7.37e+04 7.37e+04 9.66e+01 < 2e-16 ***
## avg_rent_2019 1 1.32e+05 1.32e+05 1.73e+02 < 2e-16 ***
## avg_rent_2018 1 1.86e+03 1.86e+03 2.44e+00 0.11850
## avg_rent_2017 1 9.02e+03 9.02e+03 1.18e+01 0.00063 ***
## avg_rent_2015 1 1.08e+04 1.08e+04 1.42e+01 0.00019 ***
## population 1 1.89e+04 1.89e+04 2.48e+01 8.6e-07 ***
## property_taxes.x_2018 1 1.96e+03 1.96e+03 2.57e+00 0.10916
## median_income_2021 1 4.07e+02 4.07e+02 5.30e-01 0.46564
## median_income_2019 1 2.00e+00 2.00e+00 0.00e+00 0.96186
## median_income_2015 1 3.93e+04 3.93e+04 5.15e+01 2.1e-12 ***
## StateName 43 1.88e+05 4.38e+03 5.74e+00 < 2e-16 ***
## Residuals 600 4.58e+05 7.63e+02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -median_income_2019)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 4.62e+08 4.62e+08 5.96e+05 < 2e-16 ***
## avg_rent_2022 1 2.24e+05 2.24e+05 2.89e+02 < 2e-16 ***
## avg_rent_2020 1 7.37e+04 7.37e+04 9.52e+01 < 2e-16 ***
## avg_rent_2019 1 1.32e+05 1.32e+05 1.71e+02 < 2e-16 ***
## avg_rent_2018 1 1.86e+03 1.86e+03 2.41e+00 0.12129
## avg_rent_2017 1 9.02e+03 9.02e+03 1.16e+01 0.00069 ***
## avg_rent_2015 1 1.08e+04 1.08e+04 1.39e+01 0.00021 ***
## population 1 1.89e+04 1.89e+04 2.44e+01 1.0e-06 ***
## property_taxes.x_2018 1 1.96e+03 1.96e+03 2.54e+00 0.11184
## median_income_2021 1 4.07e+02 4.07e+02 5.20e-01 0.46900
## median_income_2015 1 1.70e+04 1.70e+04 2.19e+01 3.5e-06 ***
## StateName 43 2.03e+05 4.72e+03 6.09e+00 < 2e-16 ***
## Residuals 601 4.65e+05 7.74e+02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -median_income_2021)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 4.62e+08 4.62e+08 5.82e+05 < 2e-16 ***
## avg_rent_2022 1 2.24e+05 2.24e+05 2.82e+02 < 2e-16 ***
## avg_rent_2020 1 7.37e+04 7.37e+04 9.28e+01 < 2e-16 ***
## avg_rent_2019 1 1.32e+05 1.32e+05 1.66e+02 < 2e-16 ***
## avg_rent_2018 1 1.86e+03 1.86e+03 2.35e+00 0.12589
## avg_rent_2017 1 9.02e+03 9.02e+03 1.14e+01 0.00080 ***
## avg_rent_2015 1 1.08e+04 1.08e+04 1.36e+01 0.00025 ***
## population 1 1.89e+04 1.89e+04 2.38e+01 1.4e-06 ***
## property_taxes.x_2018 1 1.96e+03 1.96e+03 2.47e+00 0.11627
## median_income_2015 1 2.72e+03 2.72e+03 3.42e+00 0.06472 .
## StateName 43 2.05e+05 4.77e+03 6.01e+00 < 2e-16 ***
## Residuals 602 4.78e+05 7.94e+02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -avg_rent_2018)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 4.62e+08 4.62e+08 5.79e+05 < 2e-16 ***
## avg_rent_2022 1 2.24e+05 2.24e+05 2.81e+02 < 2e-16 ***
## avg_rent_2020 1 7.37e+04 7.37e+04 9.25e+01 < 2e-16 ***
## avg_rent_2019 1 1.32e+05 1.32e+05 1.66e+02 < 2e-16 ***
## avg_rent_2017 1 1.88e+02 1.88e+02 2.40e-01 0.628
## avg_rent_2015 1 1.75e+04 1.75e+04 2.19e+01 3.5e-06 ***
## population 1 2.07e+04 2.07e+04 2.60e+01 4.6e-07 ***
## property_taxes.x_2018 1 2.01e+03 2.01e+03 2.53e+00 0.112
## median_income_2015 1 2.54e+03 2.54e+03 3.19e+00 0.075 .
## StateName 43 2.05e+05 4.77e+03 5.98e+00 < 2e-16 ***
## Residuals 603 4.80e+05 7.97e+02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -avg_rent_2017)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 4.62e+08 4.62e+08 5.67e+05 < 2e-16 ***
## avg_rent_2022 1 2.24e+05 2.24e+05 2.75e+02 < 2e-16 ***
## avg_rent_2020 1 7.37e+04 7.37e+04 9.05e+01 < 2e-16 ***
## avg_rent_2019 1 1.32e+05 1.32e+05 1.62e+02 < 2e-16 ***
## avg_rent_2015 1 6.64e+03 6.64e+03 8.16e+00 0.0044 **
## population 1 2.44e+04 2.44e+04 2.99e+01 6.5e-08 ***
## property_taxes.x_2018 1 8.70e+02 8.70e+02 1.07e+00 0.3015
## median_income_2015 1 2.70e+03 2.70e+03 3.31e+00 0.0693 .
## StateName 43 2.02e+05 4.70e+03 5.78e+00 < 2e-16 ***
## Residuals 604 4.92e+05 8.14e+02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -property_taxes.x_2018)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 4.62e+08 4.62e+08 5.67e+05 < 2e-16 ***
## avg_rent_2022 1 2.24e+05 2.24e+05 2.75e+02 < 2e-16 ***
## avg_rent_2020 1 7.37e+04 7.37e+04 9.05e+01 < 2e-16 ***
## avg_rent_2019 1 1.32e+05 1.32e+05 1.62e+02 < 2e-16 ***
## avg_rent_2015 1 6.64e+03 6.64e+03 8.15e+00 0.0044 **
## population 1 2.44e+04 2.44e+04 2.99e+01 6.6e-08 ***
## median_income_2015 1 3.56e+03 3.56e+03 4.37e+00 0.0369 *
## StateName 43 2.01e+05 4.67e+03 5.73e+00 < 2e-16 ***
## Residuals 605 4.93e+05 8.15e+02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Here, we decide to take median_income_2015 out becaue even though it has significance at the 0.05 level it does not make logical sense for this to contribute meaningfully when median incomes from other more recent years did not.
new_model <- update(new_model, .~. -median_income_2015)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 4.62e+08 4.62e+08 5.66e+05 < 2e-16 ***
## avg_rent_2022 1 2.24e+05 2.24e+05 2.75e+02 < 2e-16 ***
## avg_rent_2020 1 7.37e+04 7.37e+04 9.04e+01 < 2e-16 ***
## avg_rent_2019 1 1.32e+05 1.32e+05 1.62e+02 < 2e-16 ***
## avg_rent_2015 1 6.64e+03 6.64e+03 8.15e+00 0.0045 **
## population 1 2.44e+04 2.44e+04 2.99e+01 6.7e-08 ***
## StateName 43 2.03e+05 4.73e+03 5.80e+00 < 2e-16 ***
## Residuals 606 4.94e+05 8.15e+02
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Here, again even though avg_rent_2015 is significant it does not make the most sense given more recent years were taken out so we will remove it.
new_model <- update(new_model, .~. -avg_rent_2015)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 3.78e+09 3.78e+09 1.18e+06 <2e-16 ***
## avg_rent_2022 1 1.65e+06 1.65e+06 5.17e+02 <2e-16 ***
## avg_rent_2020 1 7.90e+05 7.90e+05 2.47e+02 <2e-16 ***
## avg_rent_2019 1 1.27e+06 1.27e+06 3.97e+02 <2e-16 ***
## population 1 2.66e+05 2.66e+05 8.33e+01 <2e-16 ***
## StateName 48 1.64e+06 3.42e+04 1.07e+01 <2e-16 ***
## Residuals 982 3.14e+06 3.20e+03
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Here all factors are significant at the 0.05 level and make sense to be contributors. The fact that the avg_rent_2021 was taken out of the model could make sense due to COVID having an impact on what renters were leasing at the following year once rent freezes were no longer in effect in the wake of the immediate COVID crisis.
final_US_model <- new_model
anova(final_US_model)
## Analysis of Variance Table
##
## Response: avg_rent_2024
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_rent_2023 1 3.78e+09 3.78e+09 1.18e+06 <2e-16 ***
## avg_rent_2022 1 1.65e+06 1.65e+06 5.17e+02 <2e-16 ***
## avg_rent_2020 1 7.90e+05 7.90e+05 2.47e+02 <2e-16 ***
## avg_rent_2019 1 1.27e+06 1.27e+06 3.97e+02 <2e-16 ***
## population 1 2.66e+05 2.66e+05 8.33e+01 <2e-16 ***
## StateName 48 1.64e+06 3.42e+04 1.07e+01 <2e-16 ***
## Residuals 982 3.14e+06 3.20e+03
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Intepretation:
# First, drop NA's
county_aggregated_df_clean <- drop_na(county_aggregated_df)
fit_rent_only <- lm(avg_2024_rent ~ avg_2023_rent + avg_2022_rent + avg_2021_rent + avg_2020_rent + avg_2019_rent + avg_2018_rent + avg_2017_rent + avg_2016_rent + avg_2015_rent, data = county_aggregated_df_clean)
summary(fit_rent_only)
##
## Call:
## lm(formula = avg_2024_rent ~ avg_2023_rent + avg_2022_rent +
## avg_2021_rent + avg_2020_rent + avg_2019_rent + avg_2018_rent +
## avg_2017_rent + avg_2016_rent + avg_2015_rent, data = county_aggregated_df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -214.6 -27.9 -8.7 15.0 383.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.56680 15.45348 1.01 0.31
## avg_2023_rent 1.35742 0.07030 19.31 <2e-16 ***
## avg_2022_rent -0.48856 0.09720 -5.03 9e-07 ***
## avg_2021_rent 0.11540 0.09352 1.23 0.22
## avg_2020_rent 0.18077 0.20097 0.90 0.37
## avg_2019_rent -0.17004 0.19494 -0.87 0.38
## avg_2018_rent 0.04028 0.19740 0.20 0.84
## avg_2017_rent -0.01551 0.17884 -0.09 0.93
## avg_2016_rent -0.00901 0.12139 -0.07 0.94
## avg_2015_rent 0.03256 0.07022 0.46 0.64
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 61.4 on 276 degrees of freedom
## Multiple R-squared: 0.986, Adjusted R-squared: 0.986
## F-statistic: 2.22e+03 on 9 and 276 DF, p-value: <2e-16
Now use backwards selection to choose variables
# Use backwards selection
bwd_rent_only_model <- step(fit_rent_only, direction = "backward", trace = 0)
summary(bwd_rent_only_model)
##
## Call:
## lm(formula = avg_2024_rent ~ avg_2023_rent + avg_2022_rent +
## avg_2021_rent, data = county_aggregated_df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -204.9 -28.8 -9.7 16.4 376.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.5348 13.4647 1.45 0.15
## avg_2023_rent 1.3743 0.0677 20.29 < 2e-16 ***
## avg_2022_rent -0.5265 0.0892 -5.90 1.0e-08 ***
## avg_2021_rent 0.1894 0.0444 4.27 2.7e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 60.9 on 282 degrees of freedom
## Multiple R-squared: 0.986, Adjusted R-squared: 0.986
## F-statistic: 6.76e+03 on 3 and 282 DF, p-value: <2e-16
Let’s do the same thing using Lasso.
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
library(coefplot)
## get X matrix and y
X <- as.matrix(county_aggregated_df_clean[, c('avg_2023_rent', 'avg_2022_rent', 'avg_2021_rent', 'avg_2020_rent', 'avg_2019_rent', 'avg_2018_rent', 'avg_2017_rent', 'avg_2016_rent', 'avg_2015_rent')]) # Exclude target, FIPS, and County columns
y <- county_aggregated_df_clean$avg_2024_rent # Assuming target is stored in `target_variable`
cv_fit <- cv.glmnet(X, y, alpha = 1)
plot(cv_fit)
Now we will add other variables to the mix.
fit_all_info <- lm(avg_2024_rent ~ avg_2023_rent + avg_2022_rent + avg_2021_rent + avg_2020_rent + avg_2019_rent + avg_2018_rent + avg_2017_rent + avg_2016_rent + avg_2015_rent + avg_2015_median_income + avg_2016_median_income + avg_2016_median_income + avg_2017_median_income + avg_2018_median_income + avg_2019_median_income + avg_2020_median_income + avg_2021_median_income + avg_2022_median_income + avg_2015_property_taxes + avg_2016_property_taxes + avg_2017_property_taxes + avg_2018_property_taxes + avg_2019_property_taxes + avg_2020_property_taxes + avg_2021_property_taxes + avg_2022_property_taxes, data = county_aggregated_df_clean)
summary(fit_all_info)
##
## Call:
## lm(formula = avg_2024_rent ~ avg_2023_rent + avg_2022_rent +
## avg_2021_rent + avg_2020_rent + avg_2019_rent + avg_2018_rent +
## avg_2017_rent + avg_2016_rent + avg_2015_rent + avg_2015_median_income +
## avg_2016_median_income + avg_2016_median_income + avg_2017_median_income +
## avg_2018_median_income + avg_2019_median_income + avg_2020_median_income +
## avg_2021_median_income + avg_2022_median_income + avg_2015_property_taxes +
## avg_2016_property_taxes + avg_2017_property_taxes + avg_2018_property_taxes +
## avg_2019_property_taxes + avg_2020_property_taxes + avg_2021_property_taxes +
## avg_2022_property_taxes, data = county_aggregated_df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -204.0 -26.1 -6.9 15.6 389.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.31678 18.49975 0.34 0.73304
## avg_2023_rent 1.23267 0.07355 16.76 < 2e-16 ***
## avg_2022_rent -0.34017 0.10201 -3.33 0.00098 ***
## avg_2021_rent 0.15020 0.09664 1.55 0.12134
## avg_2020_rent -0.10523 0.20607 -0.51 0.61003
## avg_2019_rent 0.10374 0.19773 0.52 0.60029
## avg_2018_rent -0.08393 0.19826 -0.42 0.67241
## avg_2017_rent 0.16437 0.18768 0.88 0.38196
## avg_2016_rent -0.03120 0.12945 -0.24 0.80971
## avg_2015_rent -0.04574 0.07630 -0.60 0.54939
## avg_2015_median_income 0.00387 0.00513 0.75 0.45121
## avg_2016_median_income 0.00177 0.00809 0.22 0.82693
## avg_2017_median_income -0.00744 0.00829 -0.90 0.37048
## avg_2018_median_income 0.00507 0.00709 0.71 0.47538
## avg_2019_median_income -0.00356 0.00580 -0.61 0.53956
## avg_2020_median_income -0.00762 0.00572 -1.33 0.18427
## avg_2021_median_income 0.01485 0.00628 2.37 0.01876 *
## avg_2022_median_income -0.00637 0.00410 -1.55 0.12186
## avg_2015_property_taxes -0.04285 0.13854 -0.31 0.75733
## avg_2016_property_taxes 0.19511 0.25707 0.76 0.44856
## avg_2017_property_taxes -0.14826 0.23841 -0.62 0.53456
## avg_2018_property_taxes 0.07925 0.20797 0.38 0.70348
## avg_2019_property_taxes -0.04094 0.15407 -0.27 0.79065
## avg_2020_property_taxes 0.04837 0.13243 0.37 0.71520
## avg_2021_property_taxes -0.09172 0.13513 -0.68 0.49791
## avg_2022_property_taxes 0.01286 0.06411 0.20 0.84116
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 59.1 on 260 degrees of freedom
## Multiple R-squared: 0.988, Adjusted R-squared: 0.987
## F-statistic: 862 on 25 and 260 DF, p-value: <2e-16
Now run backwards selection again
# Use backwards selection
bwd_all_info_model <- step(fit_all_info, direction = "backward", trace = 0)
summary(bwd_all_info_model)
##
## Call:
## lm(formula = avg_2024_rent ~ avg_2023_rent + avg_2022_rent +
## avg_2021_rent + avg_2020_median_income + avg_2021_median_income +
## avg_2022_median_income + avg_2016_property_taxes + avg_2021_property_taxes,
## data = county_aggregated_df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -204.7 -27.0 -5.2 13.9 397.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.03080 15.82493 0.57 0.56869
## avg_2023_rent 1.22629 0.07024 17.46 < 2e-16 ***
## avg_2022_rent -0.31949 0.09297 -3.44 0.00068 ***
## avg_2021_rent 0.12840 0.04641 2.77 0.00605 **
## avg_2020_median_income -0.00710 0.00364 -1.95 0.05242 .
## avg_2021_median_income 0.01410 0.00575 2.45 0.01487 *
## avg_2022_median_income -0.00664 0.00354 -1.88 0.06138 .
## avg_2016_property_taxes 0.06211 0.01263 4.92 1.5e-06 ***
## avg_2021_property_taxes -0.04991 0.01155 -4.32 2.2e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57.8 on 277 degrees of freedom
## Multiple R-squared: 0.988, Adjusted R-squared: 0.988
## F-statistic: 2.82e+03 on 8 and 277 DF, p-value: <2e-16
Now check this output using Anova
Take out the avg_2020_median_income
new_model <- update(bwd_all_info_model, .~. -avg_2020_median_income)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_2024_rent
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_2023_rent 1 75032525 75032525 22245.65 < 2e-16 ***
## avg_2022_rent 1 62408 62408 18.50 2.4e-05 ***
## avg_2021_rent 1 67512 67512 20.02 1.1e-05 ***
## avg_2021_median_income 1 3739 3739 1.11 0.2933
## avg_2022_median_income 1 15427 15427 4.57 0.0333 *
## avg_2016_property_taxes 1 27709 27709 8.22 0.0045 **
## avg_2021_property_taxes 1 60770 60770 18.02 3.0e-05 ***
## Residuals 278 937668 3373
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Now take out avg_2021_median_income
new_model <- update(new_model, .~. -avg_2021_median_income)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_2024_rent
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_2023_rent 1 75032525 75032525 22150.36 < 2e-16 ***
## avg_2022_rent 1 62408 62408 18.42 2.4e-05 ***
## avg_2021_rent 1 67512 67512 19.93 1.2e-05 ***
## avg_2022_median_income 1 2757 2757 0.81 0.368
## avg_2016_property_taxes 1 37264 37264 11.00 0.001 **
## avg_2021_property_taxes 1 60204 60204 17.77 3.4e-05 ***
## Residuals 279 945090 3387
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Now take out avg_2022_median_income
new_model <- update(new_model, .~. -avg_2022_median_income)
anova(new_model)
## Analysis of Variance Table
##
## Response: avg_2024_rent
## Df Sum Sq Mean Sq F value Pr(>F)
## avg_2023_rent 1 75032525 75032525 22199.2 < 2e-16 ***
## avg_2022_rent 1 62408 62408 18.5 2.4e-05 ***
## avg_2021_rent 1 67512 67512 20.0 1.1e-05 ***
## avg_2016_property_taxes 1 39351 39351 11.6 0.00074 ***
## avg_2021_property_taxes 1 59574 59574 17.6 3.6e-05 ***
## Residuals 280 946390 3380
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Let’s call and save the final model we have come to.
final_US_fuller_model <- new_model
summary(final_US_fuller_model)
##
## Call:
## lm(formula = avg_2024_rent ~ avg_2023_rent + avg_2022_rent +
## avg_2021_rent + avg_2016_property_taxes + avg_2021_property_taxes,
## data = county_aggregated_df_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -211.6 -25.2 -4.8 15.6 393.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.8349 13.1349 1.21 0.22901
## avg_2023_rent 1.2222 0.0705 17.33 < 2e-16 ***
## avg_2022_rent -0.3261 0.0931 -3.50 0.00053 ***
## avg_2021_rent 0.1415 0.0453 3.12 0.00197 **
## avg_2016_property_taxes 0.0606 0.0125 4.84 2.1e-06 ***
## avg_2021_property_taxes -0.0472 0.0112 -4.20 3.6e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 58.1 on 280 degrees of freedom
## Multiple R-squared: 0.988, Adjusted R-squared: 0.987
## F-statistic: 4.45e+03 on 5 and 280 DF, p-value: <2e-16
Interpretation:
Get data from the Census buearu for each county from 2015-2022
library(tidycensus)
library(tidyverse)
#Requesting API key from the Census
census_api_key("8742799b6616820ce0d0a8dd45acd13c5b47fcc7", install = TRUE)
# Set Census API Key
readRenviron("~/.Renviron") # Ensure the API key is loaded
# Define years from 2015 to the most recent available ACS data
years <- 2015:2022
# Retrieve Median Household Income
income_data <- purrr::map_dfr(years, function(y) {
get_acs(
geography = "county",
variables = c(median_income = "B19013_001"),
year = y,
survey = "acs5"
) %>%
mutate(year = y)
})
# Retrieve Median Age Data
age_data <- purrr::map_dfr(years, function(y) {
get_acs(
geography = "county",
variables = c(median_age = "B01002_001"),
year = y,
survey = "acs5"
) %>%
mutate(year = y)
})
#Inner Joining the Two Sets
state_summary <- income_data %>%
left_join(age_data, by = c("GEOID", "NAME", "year")) %>%
group_by(GEOID, NAME, year) %>%
summarise(
median_income = mean(estimate.x, na.rm = TRUE),
median_age = mean(estimate.y, na.rm = TRUE),
.groups = "drop"
)
# Retrieve Property Tax Data
property_tax_data <- purrr::map_dfr(years, function(y) {
get_acs(
geography = "county",
variables = c(
property_taxes = "B25103_001" # Selected Monthly Owner Costs for Housing Units (includes property taxes)
),
year = y,
survey = "acs5"
) %>%
mutate(year = y)
})
state_taxes_summary <- property_tax_data %>%
group_by(GEOID, NAME, year) %>%
summarise(property_taxes = mean(estimate, na.rm = TRUE), .groups = "drop")
state_summary <- state_summary %>%
left_join(state_taxes_summary, by = c("GEOID", "NAME", "year"))
# Reshape data to have one row per county, separate columns for each year's median income & age
state_summary_wide <- state_summary %>%
pivot_wider(names_from = year,
values_from = c(median_income, median_age, property_taxes.x),
names_glue = "{.value}_{year}")
Fix the formatting of the income data to separate the state and county as well as add the stae abbreviation code.
county_data <- read.csv("data/state_summary_wide.csv")
library(tidyverse)
county_data <- county_data %>%
separate(NAME, into = c("county", "state"), sep = ", ")
state_abbreviations <- data.frame(
state = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut",
"Delaware","District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa",
"Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
"Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
"New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma",
"Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee",
"Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"),
abbreviation = c("AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE","DC", "FL", "GA", "HI", "ID", "IL", "IN", "IA",
"KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
"NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT",
"VA", "WA", "WV", "WI", "WY")
)
county_data <- county_data %>%
left_join(state_abbreviations, by = "state")
write.csv(county_data, "state_income_summary_updated.csv", row.names = FALSE)
Load in the data
zillow <- read.csv("data/City_zori_uc_sfrcondomfr_sm_month.csv")
county_geo <- read.csv("data/uscounties.csv")
county_data <- read.csv("data/state_income_summary_updated.csv")
Inner join the data frames
df <- inner_join(county_data, county_geo, by = c("GEOID" = "county_fips"))
colnames(df)[colnames(df) == "county.x"] <- "county_fullname"
df <- df %>%
mutate(county_state = paste(county_fullname, state_id))
zillow <- zillow %>%
mutate(county_state = paste(CountyName, StateName))
data <- inner_join(zillow, df, by = "county_state")
data_cleaned <- data %>% distinct(RegionID, RegionName, X2015.01.31, .keep_all = TRUE)
data_cleaned <- data_cleaned %>%
select(-c(RegionType, state_name, abbreviation, county.y, county_full, county_fullname, state_id, State))
colnames(data_cleaned)[colnames(data_cleaned) == "county_ascii"] <- "county"
write.csv(data_cleaned, "combined_zillow_project_data.csv", row.names = FALSE)